Version 2023.11.21_11.24.15 - focuses on genes found only relative to TCGA, not relative to any other cohort. Previous versions focused on outliers detected relative to TCGA and not treehouse, irrespective of whatever other cohorts they were outliers in

Version 2023.11.29_09.19.33 - focuses on all druggable genes, not only genes that were outliers vs one cohort or another

Version 2023.11.30_16.40.01 - incorporates pan disease cohorts

outliers <- read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv") %>%
  mutate(high_level_cohort = ifelse(str_detect(comparison_cohort, "Treehouse"),
                                    "Treehouse",
                                    comparison_cohort))
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

COMPARE DISTRIBUTIONS FOR FOR OUTLIERS ACROSS COHORTS

outlier_genes_detected <- unique(outliers$gene)

v11_expr <- read_tsv("../input_data/druggable_TumorCompendium_v11_PolyA_hugo_log2tpm_58581genes_2020-04-09.tsv.gz") %>%
  rename(Sample_ID = TH_id) %>%
  mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 1414917 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, TH_id
## dbl (1): log2TPM1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stanford_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TH03_TH34_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "TH03_TH34")
## Rows: 110 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
TCGA_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TCGA_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "TCGA")
## Rows: 9806 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PEDAYA_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/PEDAYA_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "PEDAYA")
## Rows: 2814 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pd_cohorts <- read_tsv("../input_data/pan_disease_cohort_members_2023_11_30-16_37_31_2023.tsv") %>%
  rename(original_cohort_name = cohort,
         focus_sample_ID = TH_id,
         Sample_ID = cohort_member) %>%
  mutate(
         cohort_pd_subset = str_replace(original_cohort_name,
                              "first_degree_mcs_cohort", "pan_disease_1st_degree") %>%
           str_replace("first_and_second_degree_mcs_cohort", "pan_disease_1st_and_2nd_degree") %>%
           str_replace("diagnosed_disease_cohort", "pan_disease_same_diagnosis") %>%
           str_replace("pandisease_samples", "pan_disease_same_inferred_diagnosis"),
         cohort = paste(focus_sample_ID, cohort_pd_subset))
## Rows: 69829 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): cohort, TH_id, cohort_member
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# dput(unique(pd_cohorts$cohort))
n_distinct(pd_cohorts$cohort)
## [1] 116
v8_expr <- read_tsv("../input_data/v8_expr_for_ckcc2.tsv.gz") %>%
  mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 3280536 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Sample_ID
## dbl (1): log2TPM1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
v10_expr <- read_tsv("../input_data/v10_expr_for_ckcc2.tsv.gz") %>%
  mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 234324 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Sample_ID
## dbl (1): log2TPM1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
expr <- bind_rows(
  v10_expr, 
  v8_expr,
  v11_expr) %>%
  distinct()

pan_cancer_samples <- expr %>%
  select(Sample_ID) %>%
  distinct() %>%
  mutate(cohort = "Treehouse_pc")


samples_in_cohorts <- bind_rows(
  stanford_samples,
  TCGA_samples,
  PEDAYA_samples,
  pan_cancer_samples,
  pd_cohorts %>%
    select(cohort, Sample_ID))


tabyl(samples_in_cohorts,
      cohort)
##                                             cohort     n      percent
##                                             PEDAYA  2814 2.950830e-02
##                                               TCGA  9806 1.028281e-01
##                                          TH03_TH34   110 1.153487e-03
##       TH34_1150_S02 pan_disease_1st_and_2nd_degree   100 1.048625e-03
##               TH34_1150_S02 pan_disease_1st_degree    52 5.452849e-04
##           TH34_1150_S02 pan_disease_same_diagnosis    77 8.074410e-04
##  TH34_1150_S02 pan_disease_same_inferred_diagnosis    77 8.074410e-04
##       TH34_1162_S01 pan_disease_1st_and_2nd_degree  1843 1.932615e-02
##               TH34_1162_S01 pan_disease_1st_degree    80 8.388998e-04
##           TH34_1162_S01 pan_disease_same_diagnosis    72 7.550098e-04
##  TH34_1162_S01 pan_disease_same_inferred_diagnosis    72 7.550098e-04
##       TH34_1163_S01 pan_disease_1st_and_2nd_degree   423 4.435683e-03
##               TH34_1163_S01 pan_disease_1st_degree    48 5.033399e-04
##           TH34_1163_S01 pan_disease_same_diagnosis    58 6.082023e-04
##  TH34_1163_S01 pan_disease_same_inferred_diagnosis    99 1.038138e-03
##           TH34_1179_S01 pan_disease_same_diagnosis   290 3.041012e-03
##       TH34_1238_S01 pan_disease_1st_and_2nd_degree   120 1.258350e-03
##               TH34_1238_S01 pan_disease_1st_degree     6 6.291748e-05
##           TH34_1238_S01 pan_disease_same_diagnosis   265 2.778856e-03
##  TH34_1238_S01 pan_disease_same_inferred_diagnosis   265 2.778856e-03
##       TH34_1239_S01 pan_disease_1st_and_2nd_degree   582 6.102996e-03
##               TH34_1239_S01 pan_disease_1st_degree    96 1.006680e-03
##           TH34_1239_S01 pan_disease_same_diagnosis   420 4.404224e-03
##  TH34_1239_S01 pan_disease_same_inferred_diagnosis   687 7.204052e-03
##       TH34_1240_S01 pan_disease_1st_and_2nd_degree    63 6.606336e-04
##               TH34_1240_S01 pan_disease_1st_degree     2 2.097249e-05
##           TH34_1240_S01 pan_disease_same_diagnosis    73 7.654961e-04
##  TH34_1240_S01 pan_disease_same_inferred_diagnosis    73 7.654961e-04
##           TH34_1349_S01 pan_disease_same_diagnosis   730 7.654961e-03
##           TH34_1349_S02 pan_disease_same_diagnosis   730 7.654961e-03
##       TH34_1350_S01 pan_disease_1st_and_2nd_degree   620 6.501473e-03
##               TH34_1350_S01 pan_disease_1st_degree    92 9.647348e-04
##           TH34_1350_S01 pan_disease_same_diagnosis   422 4.425196e-03
##  TH34_1350_S01 pan_disease_same_inferred_diagnosis   698 7.319401e-03
##       TH34_1351_S01 pan_disease_1st_and_2nd_degree  5470 5.735977e-02
##               TH34_1351_S01 pan_disease_1st_degree   137 1.436616e-03
##           TH34_1351_S01 pan_disease_same_diagnosis   151 1.583423e-03
##  TH34_1351_S01 pan_disease_same_inferred_diagnosis   151 1.583423e-03
##       TH34_1352_S01 pan_disease_1st_and_2nd_degree   339 3.554838e-03
##               TH34_1352_S01 pan_disease_1st_degree     3 3.145874e-05
##  TH34_1352_S01 pan_disease_same_inferred_diagnosis     4 4.194499e-05
##       TH34_1379_S01 pan_disease_1st_and_2nd_degree  4777 5.009280e-02
##               TH34_1379_S01 pan_disease_1st_degree    45 4.718811e-04
##           TH34_1379_S01 pan_disease_same_diagnosis   666 6.983841e-03
##  TH34_1379_S01 pan_disease_same_inferred_diagnosis  1025 1.074840e-02
##       TH34_1380_S01 pan_disease_1st_and_2nd_degree   157 1.646341e-03
##               TH34_1380_S01 pan_disease_1st_degree     8 8.388998e-05
##           TH34_1380_S01 pan_disease_same_diagnosis    45 4.718811e-04
##  TH34_1380_S01 pan_disease_same_inferred_diagnosis    45 4.718811e-04
##       TH34_1381_S01 pan_disease_1st_and_2nd_degree   866 9.081090e-03
##               TH34_1381_S01 pan_disease_1st_degree    27 2.831287e-04
##           TH34_1381_S01 pan_disease_same_diagnosis    27 2.831287e-04
##  TH34_1381_S01 pan_disease_same_inferred_diagnosis   838 8.787475e-03
##       TH34_1399_S01 pan_disease_1st_and_2nd_degree  1371 1.437665e-02
##               TH34_1399_S01 pan_disease_1st_degree    12 1.258350e-04
##           TH34_1399_S01 pan_disease_same_diagnosis   316 3.313654e-03
##  TH34_1399_S01 pan_disease_same_inferred_diagnosis    92 9.647348e-04
##       TH34_1400_S01 pan_disease_1st_and_2nd_degree  3486 3.655506e-02
##               TH34_1400_S01 pan_disease_1st_degree    28 2.936149e-04
##           TH34_1400_S01 pan_disease_same_diagnosis    50 5.243124e-04
##  TH34_1400_S01 pan_disease_same_inferred_diagnosis   302 3.166847e-03
##       TH34_1412_S01 pan_disease_1st_and_2nd_degree     5 5.243124e-05
##               TH34_1412_S01 pan_disease_1st_degree     4 4.194499e-05
##           TH34_1412_S01 pan_disease_same_diagnosis   316 3.313654e-03
##  TH34_1412_S01 pan_disease_same_inferred_diagnosis     5 5.243124e-05
##       TH34_1414_S01 pan_disease_1st_and_2nd_degree   183 1.918983e-03
##               TH34_1414_S01 pan_disease_1st_degree    33 3.460462e-04
##           TH34_1414_S01 pan_disease_same_diagnosis    42 4.404224e-04
##  TH34_1414_S01 pan_disease_same_inferred_diagnosis    42 4.404224e-04
##       TH34_1415_S01 pan_disease_1st_and_2nd_degree  5605 5.877542e-02
##               TH34_1415_S01 pan_disease_1st_degree    65 6.816061e-04
##           TH34_1415_S01 pan_disease_same_diagnosis   324 3.397544e-03
##  TH34_1415_S01 pan_disease_same_inferred_diagnosis  1101 1.154536e-02
##       TH34_1444_S01 pan_disease_1st_and_2nd_degree   892 9.353733e-03
##               TH34_1444_S01 pan_disease_1st_degree    69 7.235511e-04
##  TH34_1444_S01 pan_disease_same_inferred_diagnosis   514 5.389931e-03
##       TH34_1445_S02 pan_disease_1st_and_2nd_degree  1342 1.407254e-02
##               TH34_1445_S02 pan_disease_1st_degree   558 5.851326e-03
##           TH34_1445_S02 pan_disease_same_diagnosis   798 8.368025e-03
##  TH34_1445_S02 pan_disease_same_inferred_diagnosis   798 8.368025e-03
##       TH34_1446_S01 pan_disease_1st_and_2nd_degree   886 9.290815e-03
##               TH34_1446_S01 pan_disease_1st_degree    62 6.501473e-04
##           TH34_1446_S01 pan_disease_same_diagnosis   798 8.368025e-03
##  TH34_1446_S01 pan_disease_same_inferred_diagnosis   799 8.378512e-03
##       TH34_1447_S01 pan_disease_1st_and_2nd_degree   997 1.045479e-02
##               TH34_1447_S01 pan_disease_1st_degree     6 6.291748e-05
##           TH34_1447_S01 pan_disease_same_diagnosis   364 3.816994e-03
##  TH34_1447_S01 pan_disease_same_inferred_diagnosis   823 8.630182e-03
##           TH34_1447_S02 pan_disease_same_diagnosis   364 3.816994e-03
##       TH34_1452_S01 pan_disease_1st_and_2nd_degree   234 2.453782e-03
##               TH34_1452_S01 pan_disease_1st_degree     2 2.097249e-05
##           TH34_1452_S01 pan_disease_same_diagnosis    21 2.202112e-04
##  TH34_1452_S01 pan_disease_same_inferred_diagnosis    38 3.984774e-04
##       TH34_1455_S01 pan_disease_1st_and_2nd_degree   433 4.540545e-03
##               TH34_1455_S01 pan_disease_1st_degree     2 2.097249e-05
##           TH34_1455_S01 pan_disease_same_diagnosis    21 2.202112e-04
##  TH34_1455_S01 pan_disease_same_inferred_diagnosis   602 6.312721e-03
##       TH34_1456_S02 pan_disease_1st_and_2nd_degree  2845 2.983337e-02
##               TH34_1456_S02 pan_disease_1st_degree    33 3.460462e-04
##           TH34_1456_S02 pan_disease_same_diagnosis   156 1.635855e-03
##  TH34_1456_S02 pan_disease_same_inferred_diagnosis  1170 1.226891e-02
##           TH34_2292_S01 pan_disease_same_diagnosis   415 4.351793e-03
##       TH34_2293_S01 pan_disease_1st_and_2nd_degree  4586 4.808993e-02
##               TH34_2293_S01 pan_disease_1st_degree    33 3.460462e-04
##           TH34_2293_S01 pan_disease_same_diagnosis   415 4.351793e-03
##  TH34_2293_S01 pan_disease_same_inferred_diagnosis   325 3.408030e-03
##       TH34_2351_S01 pan_disease_1st_and_2nd_degree  1725 1.808878e-02
##               TH34_2351_S01 pan_disease_1st_degree    25 2.621562e-04
##           TH34_2351_S01 pan_disease_same_diagnosis    50 5.243124e-04
##  TH34_2351_S01 pan_disease_same_inferred_diagnosis   107 1.122028e-03
##       TH34_2410_S01 pan_disease_1st_and_2nd_degree  3818 4.003649e-02
##               TH34_2410_S01 pan_disease_1st_degree    35 3.670187e-04
##           TH34_2410_S01 pan_disease_same_diagnosis   439 4.603463e-03
##  TH34_2410_S01 pan_disease_same_inferred_diagnosis   337 3.533865e-03
##       TH34_2411_S01 pan_disease_1st_and_2nd_degree  3420 3.586297e-02
##               TH34_2411_S01 pan_disease_1st_degree   116 1.216405e-03
##           TH34_2411_S01 pan_disease_same_diagnosis   188 1.971414e-03
##  TH34_2411_S01 pan_disease_same_inferred_diagnosis   327 3.429003e-03
##           TH34_2666_S01 pan_disease_same_diagnosis   443 4.645408e-03
##                                       Treehouse_pc 12804 1.342659e-01

expression in samples not in the compendium

rsem_path <- "../input_data/non_compendium_expression"

gene_name_conversion <- read_tsv(file.path(rsem_path,
                                           "EnsGeneID_Hugo_Observed_Conversions.txt"))
## Rows: 60498 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): HugoID, EnsGeneID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
relevant_gene_name_conversion <- gene_name_conversion %>%
  filter(HugoID %in% outlier_genes_detected)

rsem_kitchen_sink_data <- tibble(file_name = list.files(
  path = rsem_path,
  pattern = "_rsem_genes.results")) %>%
  rowwise() %>%
  mutate(rsem_raw = list(read_tsv(file.path(rsem_path, file_name),
                                     show_col_types = FALSE
                                     ))) %>%
  unnest(rsem_raw) %>%
  filter(gene_id %in% relevant_gene_name_conversion$EnsGeneID) %>%
  mutate(Sample_ID = str_extract(file_name, "TH[R]?[0-9]{2}_[0-9]{4}_S[0-9]{2}")) %>%
  left_join(relevant_gene_name_conversion, 
            by=c("gene_id"="EnsGeneID")) %>%
    group_by(Sample_ID, HugoID) %>%
    summarize(sum_TPM = sum(TPM),
              n=n()) %>%
    mutate(log2TPM1 = log2(sum_TPM +1))
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
table(rsem_kitchen_sink_data$n)
## 
##   1   2 
## 275   5
patient_expression_from_rsem_files <- rsem_kitchen_sink_data %>%
  select(gene = HugoID,
         log2TPM1,
         Sample_ID)

patient_expression_from_compendia <- outliers %>%
  select(Sample_ID, gene) %>%
  distinct() %>%
  left_join(expr, 
            by=c("Sample_ID", "gene"="Gene")) %>%
  na.omit() # excludes samples not in compendium

patient_expression <- bind_rows(
  patient_expression_from_rsem_files,
  patient_expression_from_compendia)
  
length(outlier_genes_detected)
## [1] 56

Determine where to find additional expression data

# v8 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv8_clinical_metadata.2018-07-25.tsv")
# v9 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv9_clinical_metadata.2019-03-15.tsv")
# v10 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/clinical_TumorCompendium_v10_PolyA_2020-01-28.ts
# v")
# 
# samples_in_cohorts %>%
#   filter(cohort == "TH34_1351_S01 pan_disease_1st_and_2nd_degree")
# 
# samples_in_cohorts %>%
#   filter(! Sample_ID %in% expr$Sample_ID) %>%
#   select(Sample_ID) %>%
#   distinct()

# samples_in_cohorts %>%
#   filter(! Sample_ID %in% expr$Sample_ID) %>%
#   select(Sample_ID) %>%
#   distinct() %>%
#   pull(Sample_ID) %>%
#   cat(sep = " ")
# 
# samples_in_cohorts %>%
#   filter(! Sample_ID %in% expr$Sample_ID) %>%
#   filter(! Sample_ID %in% v8$th_sampleid) %>%
#   filter(! Sample_ID %in% v10$th_sampleid) %>%
#   select(Sample_ID) %>%
#   distinct()

# Conclusion
# all but TH34_1456_S02 are in v8
# TH34_1456_S02 is not in v9
# TH34_1456_S02 is  in v10
# are all in v10? no. so add data from v8 and v10

retrieve additiona expression data

servers were going very slowly, so i got v8 and v10 on different servers (razzmatazz and crimson)

setwd("/scratch/hbeale/")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
library(tidyverse)
v8 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv8_unique_hugo_log2_tpm_plus_1.2018-07-25.tsv")
v10 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TumorCompendium_v10_PolyA_hugo_log2tpm_58581genes_2019-07-25.tsv")

samples_with_expr_to_retrieve <- "TH03_0296_S05 THR14_0307_S01 THR14_1181_S01 THR14_1182_S01 THR14_1185_S01 THR14_1196_S01 THR14_1197_S01 THR14_1203_S01 THR14_1213_S01 THR14_1220_S01 THR14_1229_S01 THR14_1230_S01 THR14_1231_S01 THR14_1232_S01 THR14_1233_S01 THR14_1234_S01 THR14_1235_S01 THR14_1236_S01 THR33_1131_S01 THR33_1139_S01 THR14_1180_S01 THR14_1183_S01 THR14_1184_S01 THR14_1186_S01 THR14_1188_S01 THR14_1189_S01 THR14_1190_S01 THR14_1191_S01 THR14_1192_S01 THR14_1194_S01 THR14_1195_S01 THR14_1198_S01 THR14_1199_S01 THR14_1200_S01 THR14_1201_S01 THR14_1202_S01 THR14_1204_S01 THR14_1205_S01 THR14_1206_S01 THR14_1207_S01 THR14_1209_S01 THR14_1211_S01 THR14_1212_S01 THR14_1214_S01 THR14_1215_S01 THR14_1216_S01 THR14_1217_S01 THR14_1218_S01 THR14_1219_S01 THR14_1221_S01 THR14_1222_S01 THR14_1223_S01 THR14_1237_S01 THR35_1254_S01 THR14_1208_S01 THR14_1210_S01 TH34_1456_S02" %>%
  str_split(" ") %>% unlist()

v8_relevant <- v8 %>% select(Gene, any_of(samples_with_expr_to_retrieve))

v8_relevant_longer <- pivot_longer(v8_relevant, 
             -Gene, 
             names_to = "Sample_ID",
             values_to = "log2TPM1")

write_tsv(v8_relevant_longer, "v8_expr_for_ckcc2.tsv.gz")

v10_relevant <- v10 %>% select(Gene, any_of(samples_with_expr_to_retrieve))
             
v10_relevant_longer <- pivot_longer(v10_relevant, 
             -Gene, 
             names_to = "Sample_ID",
             values_to = "log2TPM1")

write_tsv(v10_relevant_longer, "v10_expr_for_ckcc2.tsv.gz")

Calculate statistics for each cohort

cohort_thresholds_raw <- left_join(samples_in_cohorts,
                                   expr,
                                   by=c("Sample_ID")) %>%
  na.omit() %>% # while I'm missing expression data for some samples
  group_by(Gene, cohort) %>%
  summarize(q25 = quantile(log2TPM1, 0.25),
            median = median(log2TPM1),
            q75 = quantile(log2TPM1, 0.75),
            IQR = q75-q25,
            up_outlier_threshold = q75 + (1.5*IQR))
## `summarise()` has grouped output by 'Gene'. You can override using the
## `.groups` argument.

assess changes for all genes

cohort_thresholds <- cohort_thresholds_raw %>%
  pivot_longer(c(-Gene, -cohort),
               names_to = "stat") %>%
  pivot_wider(names_from = cohort, values_from = value) %>%
  mutate(frac_change_in_ped_relative_to_TCGA = 
              (PEDAYA - TCGA) / TCGA,
         frac_change_in_treehouse_relative_to_TCGA = 
            (Treehouse_pc - TCGA) / Treehouse_pc)

Review FGFR3 for TH34_1455_S01

this_sample_id <- "TH34_1455_S01"
this_gene <- "FGFR3"

this_data <- cohort_thresholds %>%
  pivot_longer(c(-Gene, -stat), 
               names_to = "cohort") %>%
  filter(Gene == this_gene,
         str_detect(cohort, this_sample_id) | 
           cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc"))

ggplot(this_data) +
  geom_point(aes(x=value, y = stat, color = cohort))

expr_relevant_to_this_outlier <-  left_join(samples_in_cohorts %>%
  filter(str_detect(cohort, this_sample_id) | 
           cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc")),
                                   expr %>%
    filter(Gene == this_gene),
                                   by=c("Sample_ID")) %>%
  na.omit() %>%  # while I'm missing expression data for some samples
  mutate(cohort = as.factor(cohort) %>%
           fct_relevel(c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34",
                         paste(this_sample_id, c("pan_disease_same_diagnosis",
                                                 "pan_disease_same_inferred_diagnosis",
                                                 "pan_disease_1st_degree",
                                                 "pan_disease_1st_and_2nd_degree"))))) %>%
  group_by(cohort) %>%
  mutate(cohort_with_n = paste0(cohort, " (n=", n(), ")")) %>%
  ungroup %>%
  arrange(cohort) %>%
  mutate(cohort_with_n = factor(cohort_with_n, 
                                levels = unique(cohort_with_n)))
  

dput(unique(expr_relevant_to_this_outlier$cohort))
## structure(1:8, levels = c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34", 
## "TH34_1455_S01 pan_disease_same_diagnosis", "TH34_1455_S01 pan_disease_same_inferred_diagnosis", 
## "TH34_1455_S01 pan_disease_1st_degree", "TH34_1455_S01 pan_disease_1st_and_2nd_degree"
## ), class = "factor")
ggplot(expr_relevant_to_this_outlier) +
  geom_boxplot(aes(y=cohort_with_n, x=log2TPM1),
                   #fill = cohort_with_n),
               outlier.shape = NA) +
  geom_vline(data = expr %>%
               filter(Gene == this_gene,
                      Sample_ID == this_sample_id),
             aes(xintercept = log2TPM1), 
             color = "red") +
  facet_col(~cohort_with_n, scales = "free_y") +
 theme(axis.text.y = element_blank(),
       axis.title.y = element_blank(),
       legend.position="none") +
  ggtitle(paste(this_gene, "cohorts for", this_sample_id)) 

# Review all sample-outlier pairs

outliers_to_plot <- outliers %>%
  group_by(gene, Sample_ID) %>%
  summarize(outlier_relative_to = paste(comparison_cohort, collapse = ", "))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
plot_distributions <- function(this_gene, this_sample_id, outlier_relative_to) {
  #this_sample_id <- "TH34_1455_S01"
  # this_gene <- "FGFR3"
  
  this_data <- cohort_thresholds %>%
    pivot_longer(c(-Gene, -stat), 
                 names_to = "cohort") %>%
    filter(Gene == this_gene,
           str_detect(cohort, this_sample_id) | 
             cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc"))
  
  expected_levels <- c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34",
                       paste(this_sample_id, c("pan_disease_same_diagnosis",
                                               "pan_disease_same_inferred_diagnosis",
                                               "pan_disease_1st_degree",
                                               "pan_disease_1st_and_2nd_degree")))
  
  expr_relevant_to_this_outlier <-  left_join(samples_in_cohorts %>%
                                                filter(str_detect(cohort, this_sample_id) | 
                                                         cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc")),
                                              expr %>%
                                                filter(Gene == this_gene),
                                              by=c("Sample_ID")) %>%
    na.omit() %>%  # while I'm missing expression data for some samples
    mutate(cohort = as.factor(cohort) %>%
             fct_relevel(expected_levels[expected_levels %in% cohort])) %>%
    group_by(cohort) %>%
    mutate(cohort_with_n = paste0(cohort, " (n=", n(), ")")) %>%
    ungroup %>%
    arrange(cohort) %>%
    mutate(cohort_with_n = factor(cohort_with_n, 
                                  levels = unique(cohort_with_n)))
  
  
  p <- ggplot(expr_relevant_to_this_outlier) +
    geom_boxplot(aes(y=cohort_with_n, x=log2TPM1),
                 #fill = cohort_with_n),
                 outlier.shape = NA) +
    geom_vline(data = expr %>%
                 filter(Gene == this_gene,
                        Sample_ID == this_sample_id),
               aes(xintercept = log2TPM1), 
               color = "red") +
    facet_col(~cohort_with_n, scales = "free_y") +
    theme(axis.text.y = element_blank(),
          axis.title.y = element_blank(),
          legend.position="none") +
    ggtitle(paste(this_gene, "cohorts for", this_sample_id, ";", outlier_relative_to)) 
  
  return(p)
}

pmap(list(outliers_to_plot$gene, outliers_to_plot$Sample_ID, outliers_to_plot$outlier_relative_to), plot_distributions)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

## 
## [[69]]

## 
## [[70]]

## 
## [[71]]

## 
## [[72]]

## 
## [[73]]

## 
## [[74]]

## 
## [[75]]

## 
## [[76]]

## 
## [[77]]

## 
## [[78]]

## 
## [[79]]

## 
## [[80]]

## 
## [[81]]

## 
## [[82]]

## 
## [[83]]

## 
## [[84]]

## 
## [[85]]

## 
## [[86]]

## 
## [[87]]

## 
## [[88]]

## 
## [[89]]

## 
## [[90]]

## 
## [[91]]

## 
## [[92]]

## 
## [[93]]

## 
## [[94]]

## 
## [[95]]

## 
## [[96]]

## 
## [[97]]

## 
## [[98]]

## 
## [[99]]

## 
## [[100]]

## 
## [[101]]

## 
## [[102]]

## 
## [[103]]

## 
## [[104]]

## 
## [[105]]

## 
## [[106]]

## 
## [[107]]

## 
## [[108]]

## 
## [[109]]

## 
## [[110]]

## 
## [[111]]

## 
## [[112]]

## 
## [[113]]

## 
## [[114]]

## 
## [[115]]

## 
## [[116]]

## 
## [[117]]

## 
## [[118]]

## 
## [[119]]

## 
## [[120]]

## 
## [[121]]

## 
## [[122]]

## 
## [[123]]

## 
## [[124]]

## 
## [[125]]

## 
## [[126]]

## 
## [[127]]

## 
## [[128]]

## 
## [[129]]

## 
## [[130]]